This problem comes under the purview of supervised machine learning. The label that we want to predict is a continuous number; thus, it is a regression problem.
import math
import pandas as pd
import seaborn as sns
import os
import numpy as np
import glob
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder,PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score
from sklearn import set_config
import plotly.express as px
import plotly.io as pio
pio.renderers.default = "plotly_mimetype+notebook"
from wordcloud import WordCloud
set_config(display="diagram")
#%matplotlib inline
### directories holding the raw training and test data files
train_dir='dataset/train/'
train_files = [train_dir + fname for fname in os.listdir(train_dir)]
test_dir = 'dataset/test/'
test_files = [test_dir + fname for fname in os.listdir(test_dir)]
### This function can help in combining data from different files into one dataframe
def getDataFrame(file_list: list) -> pd.DataFrame:
    '''
    Combine the records from several fixed-width data files into one dataframe.

    Each file is read with the feature names in its first column; the last
    row is dropped (presumably a file footer -- confirm against the raw
    files) and the table is transposed so rows become records.
    ### Parameters:
    file_list(list): list containing all file names
    ### Return:
    df(dataframe): dataframe containing the combined dataset
    '''
    # parse every file first and concatenate once -- calling pd.concat
    # inside the loop copies the accumulated frame each time (quadratic)
    frames = [pd.read_fwf(path, header=None, index_col=[0]).iloc[:-1, ].T
              for path in file_list]
    if not frames:
        # preserve the original behaviour for an empty file list
        return pd.DataFrame()
    return pd.concat(frames)
## function to load the data
def loadData(file_name1: str, file_name2: str) :
    '''
    Load the train and test dataframes from cached csv files when they
    exist; otherwise build them from the raw data files and cache them.
    ### Parameters:
    file_name1(list): list of training data file names
    file_name2(list): list of testing data file names
    ### Return:
    train_df(dataframe): dataframe containing the training data
    test_df(dataframe): dataframe containing the testing data
    '''
    if glob.glob('train.csv'):
        train_df = pd.read_csv('train.csv',index_col=[0])
    else:
        # build the frame in fixed-size chunks (the original hard-coded
        # five 10000-file slices; this generalises to any file count)
        chunk = 10000
        pieces = [getDataFrame(file_name1[start:start + chunk])
                  for start in range(0, len(file_name1), chunk)]
        train_df = pd.concat(pieces, ignore_index=True)
        # to_csv takes a boolean `index`; the original passed [0] which is
        # merely truthy -- write the index explicitly
        train_df.to_csv('train.csv', index=True)
    if glob.glob('test.csv') :
        test_df = pd.read_csv('test.csv',index_col=[0])
    else:
        test_df = getDataFrame(file_name2)
        test_df.to_csv('test.csv', index=True)
    return train_df,test_df
### combine features and label into one frame for exploratory analysis
def edaData(df1: pd.DataFrame, df2: pd.Series) -> pd.DataFrame:
    '''
    Build the dataframe used for exploratory analysis by joining the
    feature columns with the label column.
    ### Parameters:
    df1(pd.DataFrame): dataframe containing all the features
    df2(pd.Series): series containing all the labels
    ### Return:
    tempdf(pd.DataFrame): combined dataframe for eda
    '''
    combined = pd.concat([df1, df2], axis=1)
    return combined
train_df, test_df = loadData(train_files, test_files)
## size the validation split so it mirrors the test-set proportion
test_size = len(test_df) / len(train_df)
## quantity that we want to predict
predictor = 'Time_taken (min)'
X_train, X_test, y_train, y_test = train_test_split(
    train_df.drop(columns=[predictor]), train_df[predictor],
    test_size=test_size, random_state=42)
X_train.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 34194 entries, 21366 to 15795 Data columns (total 19 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ID 34194 non-null object 1 Delivery_person_ID 34194 non-null object 2 Delivery_person_Age 32775 non-null float64 3 Delivery_person_Ratings 32736 non-null float64 4 Restaurant_latitude 34194 non-null float64 5 Restaurant_longitude 34194 non-null float64 6 Delivery_location_latitude 34194 non-null float64 7 Delivery_location_longitude 34194 non-null float64 8 Order_Date 34194 non-null object 9 Time_Orderd 32870 non-null object 10 Time_Order_picked 34194 non-null object 11 Weather conditions 33734 non-null object 12 Road_traffic_density 33742 non-null object 13 Vehicle_condition 34194 non-null int64 14 Type_of_order 34194 non-null object 15 Type_of_vehicle 34194 non-null object 16 multiple_deliveries 33426 non-null float64 17 Festival 34024 non-null object 18 City 33285 non-null object dtypes: float64(7), int64(1), object(11) memory usage: 5.2+ MB
| Date time | Floats | Category |
| Delivery_person_ID | ||
| Delivery_person_Age | ||
| Delivery_person_Ratings | ||
| Restaurant_latitude | ||
| Restaurant_longitude | ||
| Delivery_location_latitude | ||
| Delivery_location_longitude | ||
| Order_Date | ||
| Time_Orderd | ||
| Time_Order_picked | ||
| Weather conditions | ||
| Road_traffic_density | ||
| Vehicle_condition | ||
| Type_of_order | ||
| Type_of_vehicle | ||
| multiple_deliveries | ||
| Festival | ||
| City |
Before we start modelling the data and making predictions, we need to convert the features into appropriate data types.
## function to convert a feature into a suitable datatype
def convertDatatype(df: pd.DataFrame, col: str, newdt: str):
    '''
    Cast one dataframe column to a new dtype (in place) and hand the
    frame back to the caller.
    ### Parameters:
    df(dataframe): dataframe that we want to update
    col(string): name of the column that we want to update
    newdt(string): target dtype for the column
    ### Return:
    df(dataframe): the updated dataframe
    '''
    recast = df[col].astype(newdt)
    df[col] = recast
    return df
def dfConvert(df: pd.DataFrame):
    '''
    Convert the datatypes of the known feature columns of the dataframe:
    numeric measurements become float64 and discrete features become
    pandas categoricals.
    ### Parameters:
    df(dataframe): dataframe that we want to operate upon
    ### Return:
    df(dataframe): converted dataframe (the same object, updated in place)
    '''
    ### features that are converted into float
    float_cols = ['Delivery_person_Age', 'Delivery_person_Ratings',
                  'Restaurant_latitude', 'Restaurant_longitude',
                  'Delivery_location_latitude', 'Delivery_location_longitude',
                  'multiple_deliveries']
    ### features that are converted into category
    category_cols = ['Delivery_person_ID', 'Weather conditions',
                     'Road_traffic_density', 'Vehicle_condition',
                     'Type_of_order', 'Type_of_vehicle', 'Festival', 'City']
    # loop over the column lists instead of 15 near-identical statements
    for col in float_cols:
        df[col] = df[col].astype('float64')
    for col in category_cols:
        df[col] = df[col].astype('category')
    return df
def convertallDf(df1:pd.DataFrame , df2: pd.DataFrame ,df3: pd.DataFrame):
    '''
    Apply the datatype conversion to all three dataframes.
    ### Parameters:
    df1(dataframe): training data
    df2(dataframe): validation data
    df3(dataframe): testing data
    ### Return:
    the three dataframes with converted dtypes, in the same order
    '''
    return dfConvert(df1), dfConvert(df2), dfConvert(df3)
X_train, X_test, test_df = convertallDf(X_train, X_test, test_df)
eda_data = edaData(X_train, y_train)
## per-column missing-value counts for the training (with label) and test data
na_train = eda_data.isna().sum()
na_test = test_df.isna().sum()
missing_data = pd.concat([na_train[na_train.values != 0],
                          na_test[na_test.values != 0]], axis=1)
missing_data.columns = ['training_data', 'testing_data']
missing_data.index.name = 'Features'
fig_na, axes_na = plt.subplots(1, 2, figsize=(8, 4))
left = sns.barplot(y=missing_data.index, x='training_data',
                   data=missing_data, ax=axes_na[0])
right = sns.barplot(y=missing_data.index, x='testing_data',
                    data=missing_data, ax=axes_na[1])
right.yaxis.set_label_position("right")
right.yaxis.tick_right()
plt.title(label='Missing values in the training and testing data',
          fontsize=16, loc="right")
plt.show()
def wordcloudmap(text,rs):
    '''
    Render a word cloud for the given text corpus.
    ### Parameters:
    text(str): corpus containing all the strings, space separated
    rs(float): relative_scaling value passed to WordCloud
    ### Return:
    None
    '''
    cloud = WordCloud(width=800, height=800,
                      collocations=False, background_color='white',
                      prefer_horizontal=0.7, max_font_size=40,
                      relative_scaling=rs, max_words=90).generate(text)
    plt.figure(figsize=(8, 8))
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis("off")
    plt.show()
## word cloud of the delivery-agent ids
wordcloudmap(" ".join(eda_data.Delivery_person_ID), rs=1)
## density histograms for every float feature, laid out on a 3x3 grid
num_cols = list(eda_data.select_dtypes(include=['float64']).columns)
fig_hist, axes_hist = plt.subplots(3, 3, figsize=(16, 12))
for idx, feature in enumerate(num_cols):
    sns.histplot(x=feature, data=eda_data, stat='density',
                 ax=axes_hist[idx % 3, idx // 3])
fig_hist.delaxes(axes_hist[1, 2])
fig_hist.delaxes(axes_hist[2, 2])
plt.show()
## latitude/longitude scatter for restaurants and delivery locations
fig_xy, axes_xy = plt.subplots(1, 2, figsize=(8, 4))
sns.scatterplot(data=X_train, x='Restaurant_latitude',
                y='Restaurant_longitude', alpha=0.1, ax=axes_xy[0])
sns.scatterplot(data=X_train, x='Delivery_location_latitude',
                y='Delivery_location_longitude', alpha=0.1, ax=axes_xy[1])
plt.show()
## the same coordinates projected on a world map, plus two filtered views
## that zoom in on the suspicious near-zero coordinates
geo_views = [
    (eda_data, 'Restaurant_latitude', 'Restaurant_longitude'),
    (eda_data, 'Delivery_location_latitude', 'Delivery_location_longitude'),
    (eda_data[eda_data.Delivery_location_latitude < 0.5],
     'Delivery_location_latitude', 'Delivery_location_longitude'),
    (eda_data[eda_data.Restaurant_latitude < 0.1],
     'Restaurant_latitude', 'Restaurant_longitude'),
]
for frame, lat_col, lon_col in geo_views:
    fig = px.scatter_geo(frame, lat=lat_col, lon=lon_col,
                         projection="natural earth")
    fig.show()
It seems the issue with the restaurant locations is caused by an incorrect sign in front of the latitude and longitude values. One can simply use the absolute value of each data point.
## re-plot the restaurant coordinates (columns 4 and 5) after taking
## their absolute value
eda_data2 = eda_data.iloc[:, [4, 5]].abs()
fig = px.scatter_geo(eda_data2, lat='Restaurant_latitude',
                     lon='Restaurant_longitude', projection="natural earth")
fig.show()
## categorical feature names (the first five non-float columns are ids/dates)
cat_cols = eda_data.select_dtypes(exclude='float').columns[5:]
cat_cols
Index(['Weather conditions', 'Road_traffic_density', 'Vehicle_condition',
'Type_of_order', 'Type_of_vehicle', 'Festival', 'City'],
dtype='object')
## box plots of the label against every categorical feature
fig_box, axes_box = plt.subplots(3, 3, figsize=(16, 16))
for idx, feature in enumerate(cat_cols):
    sns.boxplot(data=eda_data, y='Time_taken (min)',
                x=feature, ax=axes_box[idx % 3, idx // 3])
plt.delaxes(axes_box[1, 2])
plt.delaxes(axes_box[2, 2])
plt.show()
## candidate feature list with the id and raw date/time columns removed
cols = list(X_train.columns)
cols_remove = [0, 1, 8, 9, 10]
new_cols = [name for pos, name in enumerate(cols) if pos not in cols_remove]
### Replace values in the location data
def clean_locationData(df: pd.DataFrame, thrs1: float, thrs2: float, thrs3: float, thrs4: float) -> pd.DataFrame:
    '''
    Clean the four coordinate columns.
    1. Take the absolute value of every coordinate (some rows carry a
       wrong sign).
    2. Replace values below the given threshold with NaN so they can be
       imputed later.
    ### Parameters:
    df(dataframe) : dataframe containing the location data.
    thrs1(float): threshold below which Restaurant_latitude becomes NaN
    thrs2(float): threshold below which Restaurant_longitude becomes NaN
    thrs3(float): threshold below which Delivery_location_latitude becomes NaN
    thrs4(float): threshold below which Delivery_location_longitude becomes NaN
    ### Return:
    df(dataframe): the cleaned dataframe (updated in place)
    '''
    thresholds = {
        'Restaurant_latitude': thrs1,
        'Restaurant_longitude': thrs2,
        'Delivery_location_latitude': thrs3,
        'Delivery_location_longitude': thrs4,
    }
    # select by column NAME instead of the original hard-coded iloc
    # positions 4..7, so the function no longer silently corrupts other
    # columns if the frame's column order ever changes
    for col, threshold in thresholds.items():
        df[col] = df[col].abs()
        df.loc[df[col] < threshold, col] = np.nan
    return df
def create_pipeline(num_col: list, obj_col: list, clf):
    '''
    Build the full modelling pipeline: per-type feature preprocessing,
    polynomial interaction terms, then the supplied estimator.
    ### Parameters:
    num_col(list): names of the numerical features
    obj_col(list): names of the categorical features
    clf(estimator): model used for the final prediction step
    ### Return:
    ml_pipe(pipeline): pipeline combining preprocessing and model
    '''
    # numerical features: median imputation then scaling to [0, 1]
    numeric_pipe = Pipeline([
        ('impute', SimpleImputer(strategy='median')),
        ('scale', MinMaxScaler()),
    ])
    # categorical features: mode imputation then one-hot encoding
    categorical_pipe = Pipeline([
        ('impute', SimpleImputer(strategy='most_frequent')),
        ('transf', OneHotEncoder(handle_unknown='ignore')),
    ])
    # route each feature subset through its own preprocessing branch
    preprocessing = ColumnTransformer(transformers=[
        ('numf', numeric_pipe, num_col),
        ('catf', categorical_pipe, obj_col),
    ])
    # preprocessing -> interaction features -> estimator
    return Pipeline([
        ('datapipe', preprocessing),
        ('pf', PolynomialFeatures(interaction_only=True)),
        ('mlpipe', clf),
    ])
def output_file(ml_pipe,test: pd.DataFrame,clf_name: str):
    '''
    Predict on the test data and write the submission csv.
    ### Parameters:
    ml_pipe(pipeline): fitted machine learning pipeline
    test(dataframe): dataframe containing the test data
    clf_name(string): model name, used to build the output file name
    ### Return :
    None
    '''
    predictions = ml_pipe.predict(test)
    submission = pd.DataFrame({'ID': test.ID, 'Time_taken (min)': predictions})
    submission.to_csv(f'ans{clf_name}.csv', index=False)
### function to run a base ml routine. Using the data, model and features
def runMlBAse(train_data: pd.DataFrame, y_train, test_data: pd.DataFrame,
              val_data: pd.DataFrame, y_val, clf, clf_name: str, features: list) -> Pipeline:
    '''
    Train the base model on the cleaned data, report r2 scores for the
    training and validation splits and write the test-set predictions to
    a submission file.
    ### Parameters:
    train_data(dataframe): dataframe containing the training data
    y_train: labels for the training data
    test_data(dataframe): testing data
    val_data(dataframe): validation data
    y_val(array): labels for the validation data
    clf(ml model): ml model that we want to fit to the data
    clf_name(str): name of the ml model to name the output file
    features(list): list of features
    ### Return:
    ml_pipe (pipeline): fitted pipeline
    '''
    train_data, val_data, test_data = convertallDf(
        train_data, val_data, test_data)
    train_data = clean_locationData(train_data, 1.0, 1.0, 1.0, 1.0)
    val_data = clean_locationData(val_data, 1.0, 1.0, 1.0, 1.0)
    test_data = clean_locationData(test_data, 1.0, 1.0, 1.0, 1.0)
    num_col = list(train_data[features].select_dtypes(
        exclude=['category','object']).columns)
    # NOTE(review): the [1:] below silently drops the first categorical
    # feature from the model -- confirm this is intentional
    obj_col = list(train_data[features].select_dtypes(
        include=['category','object']).columns)[1:]
    print(len(num_col)+len(obj_col))
    ml_pipe = create_pipeline(num_col, obj_col, clf)
    ml_pipe.fit(train_data, y_train)
    trn_score = r2_score(y_train, ml_pipe.predict(train_data))
    val_score = r2_score(y_val, ml_pipe.predict(val_data))
    # bug fix: the original f-string had no space after "score"
    print(f'Training score {trn_score}, Validation score {val_score}')
    output_file(ml_pipe, test_data, clf_name)
    return ml_pipe
## drop identifier and raw date/time columns before fitting the baseline
drop_features = ['ID', 'Delivery_person_ID',
                 'Order_Date', 'Time_Orderd', 'Time_Order_picked']
features = X_train.columns.drop(drop_features)
clf_name = 'gbr'
gb = GradientBoostingRegressor(learning_rate=0.1, n_estimators=100,
                               max_depth=5, random_state=42)
## create the pipeline and fit it to the training data,testing data and validation data
ml_pipe_gb = runMlBAse(X_train, y_train, test_df, X_test,
                       y_test, gb, clf_name, features)
13
Using existing information to generate more features.
def deliveryPersonRES(df: pd.DataFrame) -> pd.DataFrame:
    '''
    Split Delivery_person_ID into new feature columns: the prefix before
    "RES", a numeric-looking code with "DEL" removed, and the full suffix
    after "RES".
    ### Parameters:
    df(dataframe): dataframe containing the data
    ### Return:
    df(dataframe): dataframe with the new feature columns
    '''
    parts = df.Delivery_person_ID.str.split('RES')
    tail = parts.str[1]
    df['deliveryRES1'] = parts.str[0]
    df['deliveryCode'] = tail.str.split('DEL').apply(
        lambda seg: f'{seg[0]}{seg[1]}')
    df['deliveryRES2'] = tail
    return df
def convertDatetime(df: pd.DataFrame,col: str)->pd.DataFrame:
    '''
    Convert the given column to the pandas datetime dtype (in place).
    ### Parameters:
    df(dataframe): dataframe containing the data
    col(str): column name to convert
    ### Return:
    df(dataframe): dataframe with the column in datetime format
    '''
    converted = pd.to_datetime(df[col])
    df[col] = converted
    return df
def date_time(df: pd.DataFrame) -> pd.DataFrame:
    '''
    Convert Order_Date to a pandas datetime and derive calendar features
    from it: month, ISO week number and weekday.
    ### Parameters:
    df(dataframe): dataframe containing the Order_Date column
    ### Return:
    df(dataframe): dataframe with the new date features
    '''
    df = convertDatetime(df, 'Order_Date')
    order_dates = df['Order_Date']
    df['Order_month'] = order_dates.dt.month
    df['week'] = order_dates.dt.isocalendar().week
    df['Order_weekday'] = order_dates.dt.weekday
    return df
def convertTime(x):
    '''
    Roll an (hour, minute) string pair over into a valid 24h clock time.
    ### Parameters:
    x(iterable): two strings, the hour and the minute parts of a time
    ### Return:
    new_time(str): "H:M" string with minutes < 60 and hours < 24
    '''
    hour, minute = (int(part) for part in x)
    carry_hours, minute = divmod(minute, 60)   # fold minutes >= 60 into hours
    hour = (hour + carry_hours) % 24           # wrap past midnight
    return f'{hour}:{minute}'
def dateTimeFeatures(df: pd.DataFrame) -> pd.DataFrame:
    '''
    Add calendar features derived from Order_Date plus pickup-time
    features parsed from Time_Order_picked.
    ### Parameters:
    df(dataframe): dataframe with Order_Date and Time_Order_picked columns
    ### Return:
    df(dataframe): dataframe with the new date/time feature columns
    '''
    df = date_time(df)
    # normalise "H:M" strings that may contain minutes >= 60 or hours >= 24
    picked = pd.to_datetime(
        df.Time_Order_picked.str.split(':').apply(convertTime),
        format='%H:%M')
    df['Order_pickup_hour_mins'] = picked.dt.time
    df['Pickup_hour'] = picked.dt.hour
    df['Pickup_minute'] = picked.dt.minute
    return df
def distanceMetric(df: pd.DataFrame) -> pd.DataFrame:
    '''
    Add haversine distance features (metres and kilometres) computed from
    the restaurant and delivery coordinate columns.
    ### Parameters:
    df(dataframe): dataframe whose columns 4..7 are the restaurant and
        delivery latitude/longitude block
    ### Return:
    df(dataframe): dataframe with the new distance features
    '''
    # assign the raw numpy array positionally -- the original wrapped it
    # in pd.Series, whose fresh RangeIndex aligns against the (shuffled)
    # dataframe index and scrambles the rows / produces NaNs
    dist_m = np.round(havershine(df.iloc[:, 4:8]), 3)
    df['havershine_m'] = dist_m
    # the original also re-rounded havershine_m a second time (a no-op);
    # that redundant statement is removed
    df['havershine_km'] = np.round(dist_m / 1000.0, 3)
    return df
def havershine(df: pd.DataFrame):
    '''
    Compute the haversine (great-circle) distance in metres between the
    restaurant and the delivery location for every row.
    ### Parameters:
    df(dataframe): dataframe containing the four coordinate columns
    ### Return:
    dist(ndarray): per-row distance in metres
    '''
    EARTH_RADIUS_M = 6371000
    lat1, lon1, lat2, lon2 = (
        np.radians(df[name].values)
        for name in ('Restaurant_latitude', 'Restaurant_longitude',
                     'Delivery_location_latitude',
                     'Delivery_location_longitude'))
    half_dlat = 0.5 * (lat1 - lat2)
    half_dlon = 0.5 * (lon1 - lon2)
    # haversine formula: a is the squared half-chord length
    a = np.sin(half_dlat) ** 2 + \
        np.cos(lat1) * np.cos(lat2) * np.sin(half_dlon) ** 2
    central_angle = 2.0 * np.arctan2(np.sqrt(a), np.sqrt(1.0 - a))
    return EARTH_RADIUS_M * central_angle
def dataprep(train_data: pd.DataFrame, val_data: pd.DataFrame, test_data: pd.DataFrame):
    '''
    Run the full preparation sequence on the three data splits:
    1. Convert features into their correct datatypes.
    2. Clean the (incorrect) location data.
    3. Feature engineering:
       a. features for the delivery agent,
       b. features from the order date-time,
       c. the distance feature from longitude and latitude.
    ### Parameters:
    train_data(pd.DataFrame): training data
    val_data(pd.DataFrame): validation data
    test_data(pd.DataFrame): testing data
    ### Return
    train_data(pd.DataFrame): processed training data
    val_data(pd.DataFrame): processed validation data
    test_data(pd.DataFrame): processed testing data
    '''
    train_data, val_data, test_data = convertallDf(
        train_data, val_data, test_data)
    frames = [train_data, val_data, test_data]
    # threshold 1.0 marks near-zero coordinates as missing
    frames = [clean_locationData(frame, 1.0, 1.0, 1.0, 1.0)
              for frame in frames]
    # apply each feature-engineering step to every split in turn
    for transform in (deliveryPersonRES, dateTimeFeatures, distanceMetric):
        frames = [transform(frame) for frame in frames]
    return tuple(frames)
train_data, val_data, test_data = dataprep(X_train, X_test, test_df)
eda_data = edaData(train_data, y_train)
### Resident information
res_text = " ".join(eda_data.deliveryRES1)
res_cloud = WordCloud(scale=0.5,
                      collocations=False, background_color='black',
                      prefer_horizontal=0.7, max_font_size=30,
                      colormap='winter', max_words=90).generate(res_text)
plt.figure(figsize=(8, 8))
plt.imshow(res_cloud, interpolation='bilinear')
plt.axis("off")
plt.show()
### Delivery agent ID code information
wordcloudmap(" ".join(eda_data.deliveryRES2), rs=0.3)
def multiplot(df: pd.DataFrame, col1: str, col2: str,label: str):
    '''
    Draw side-by-side box plots of the label against two features.
    ### Parameters:
    df(pd.DataFrame): dataframe containing the data
    col1(str): first feature name
    col2(str): second feature name
    label(str): label column to plot on the y axis
    ### Return:
    None
    '''
    fig, axes = plt.subplots(1, 2, figsize=(12, 6))
    for axis, feature in zip(axes, (col1, col2)):
        sns.boxplot(data=df, x=feature, y=label, ax=axis)
    plt.show()
multiplot(eda_data, 'Order_month', 'week', 'Time_taken (min)')
sns.catplot(data=eda_data, x='Order_weekday', y='Time_taken (min)', kind='box')
plt.show()
multiplot(eda_data, 'Pickup_hour', 'Pickup_minute', 'Time_taken (min)')
sns.scatterplot(data=eda_data, x='havershine_km', y='Time_taken (min)',
                alpha=0.5, x_jitter=1.2)
plt.show()
## features dropped before modelling: ids, raw timestamps and redundant columns
drop_cols = ['ID', 'Delivery_person_ID', 'Order_Date',
             'Time_Orderd', 'Time_Order_picked', 'Order_pickup_hour_mins','deliveryCode', 'deliveryRES2','havershine_m']
feature_list = [name for name in eda_data.columns[:-1]
                if name not in drop_cols]
### function to run the ml routine on the feature-engineered data
def runMLNewFeature(train_data: pd.DataFrame, y_train, test_data: pd.DataFrame,
                    val_data: pd.DataFrame, y_val, clf, clf_name: str, features: list) -> Pipeline:
    '''
    Train the model on the fully feature-engineered data, report r2
    scores for the training and validation splits and write the test-set
    predictions to a submission file.
    ### Parameters:
    train_data(dataframe): dataframe containing the training data
    y_train: labels for the training data
    test_data(dataframe): testing data
    val_data(dataframe): validation data
    y_val(array): labels for the validation data
    clf(ml model): ml model that we want to fit to the data
    clf_name(str): name of the ml model to name the output file
    features(list): list of features
    ### Return:
    ml_pipe (pipeline): fitted pipeline
    '''
    train_data,val_data,test_data = dataprep(train_data,
                                             val_data,test_data)
    num_col = list(train_data[features].select_dtypes(
        exclude=['category','object']).columns)
    # NOTE(review): the [1:] below silently drops the first categorical
    # feature from the model -- confirm this is intentional
    obj_col = list(train_data[features].select_dtypes(
        include=['category','object']).columns)[1:]
    ml_pipe = create_pipeline(num_col, obj_col, clf)
    ml_pipe.fit(train_data, y_train)
    trn_score = r2_score(y_train, ml_pipe.predict(train_data))
    val_score = r2_score(y_val, ml_pipe.predict(val_data))
    # bug fix: the original f-string had no space after "score"
    print(f'Training score {trn_score}, Validation score {val_score}')
    output_file(ml_pipe, test_data,clf_name)
    return ml_pipe
## train the feature-engineered pipeline and write the submission file
clf_name = 'gbr'
gb = GradientBoostingRegressor(learning_rate=0.1, n_estimators=100,
                               max_depth=6, random_state=42)
ml_pipe = runMLNewFeature(X_train, y_train, test_df, X_test, y_test,
                          gb, clf_name, feature_list)
Training score 0.7050687853476514, Validation score0.6564859098802589
Using the given data and the tools at hand, one can train a simple gradient boosting regressor to predict the label. The predictions were made using the historical data. One can employ a combination of hyperparameter tuning, ensembling, and stacking to improve upon the results obtained here.